"""
PDF text extraction service.

Provides efficient PDF text extraction with single-pass processing.
Complements pdf_service.py which handles PDF generation.
"""

import io
from typing import Dict, List

import pdfplumber
from loguru import logger


class PDFExtractionService:
    """Service for extracting text and metadata from PDF files."""

    @staticmethod
    def extract_text_and_metadata(
        pdf_content: bytes, filename: str
    ) -> Dict[str, any]:
        """
        Extract text and metadata from PDF in a single pass.

        This method opens the PDF only once and extracts both text content
        and metadata (page count) in the same operation, avoiding the
        performance issue of opening the file multiple times.

        Args:
            pdf_content: Raw PDF file bytes
            filename: Original filename (for logging)

        Returns:
            Dictionary with keys:
            - 'text': Extracted text content
            - 'pages': Number of pages
            - 'size': File size in bytes
            - 'filename': Original filename
            - 'success': Boolean indicating success
            - 'error': Error message if failed (None if successful)

        Raises:
            No exceptions - errors are captured in return dict
        """
        try:
            with pdfplumber.open(io.BytesIO(pdf_content)) as pdf:
                # Get pages list once
                pages = list(pdf.pages)
                page_count = len(pages)

                # Extract text from all pages in single pass
                text_parts = []
                for page in pages:
                    page_text = page.extract_text()
                    if page_text:
                        text_parts.append(page_text)

                # Combine all text
                full_text = "\n".join(text_parts)

                # Check if any text was extracted
                if not full_text.strip():
                    logger.warning(f"No extractable text found in {filename}")
                    return {
                        "text": "",
                        "pages": page_count,
                        "size": len(pdf_content),
                        "filename": filename,
                        "success": False,
                        "error": "No extractable text found",
                    }

                logger.info(
                    f"Successfully extracted text from {filename} "
                    f"({len(full_text)} chars, {page_count} pages)"
                )

                return {
                    "text": full_text.strip(),
                    "pages": page_count,
                    "size": len(pdf_content),
                    "filename": filename,
                    "success": True,
                    "error": None,
                }

        except Exception:
            # Log full exception details server-side for debugging
            logger.exception(f"Error extracting text from {filename}")
            # Return generic error message to avoid exposing internal details
            return {
                "text": "",
                "pages": 0,
                "size": len(pdf_content),
                "filename": filename,
                "success": False,
                "error": "Failed to extract text from PDF",
            }

    @staticmethod
    def extract_batch(files_data: List[Dict[str, any]]) -> Dict[str, any]:
        """
        Extract text from multiple PDF files.

        Args:
            files_data: List of dicts with 'content' (bytes) and 'filename' (str)

        Returns:
            Dictionary with:
            - 'results': List of extraction results
            - 'total_files': Total number of files processed
            - 'successful': Number of successfully processed files
            - 'failed': Number of failed files
            - 'errors': List of error messages
        """
        results = []
        successful = 0
        failed = 0
        errors = []

        for file_data in files_data:
            result = PDFExtractionService.extract_text_and_metadata(
                file_data["content"], file_data["filename"]
            )

            results.append(result)

            if result["success"]:
                successful += 1
            else:
                failed += 1
                errors.append(f"{file_data['filename']}: {result['error']}")

        return {
            "results": results,
            "total_files": len(files_data),
            "successful": successful,
            "failed": failed,
            "errors": errors,
        }


# Singleton pattern for service
_pdf_extraction_service = None


def get_pdf_extraction_service() -> PDFExtractionService:
    """Get the singleton PDF extraction service instance."""
    global _pdf_extraction_service
    if _pdf_extraction_service is None:
        _pdf_extraction_service = PDFExtractionService()
    return _pdf_extraction_service
